import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import model_evaluation_utils as meu
import matplotlib.pyplot as plt
from collections import Counter
import shap
import eli5

# Use the 'fivethirtyeight' matplotlib style for every figure in the notebook.
plt.style.use('fivethirtyeight')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Initialise the JavaScript used by SHAP's interactive plots; the trailing
# semicolon suppresses the notebook echo of the call's return value.
shap.initjs();


# Load the Adult census dataset bundled with SHAP, requesting its display form
# (display=True).
data, labels = shap.datasets.adult(display=True)

# The target holds True/False values; convert them to 1/0 in a single
# vectorized step instead of looping over every label in Python.
labels = np.asarray(labels).astype(int)

# Size of the dataset
data.shape, labels.shape

((32561, 12), (32561,))


# Preview the first three rows of the feature table.
data.head(3)


# Summarise column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Age             32561 non-null  float32 
 1   Workclass       32561 non-null  category
 2   Education-Num   32561 non-null  float32 
 3   Marital Status  32561 non-null  category
 4   Occupation      32561 non-null  category
 5   Relationship    32561 non-null  category
 6   Race            32561 non-null  category
 7   Sex             32561 non-null  category
 8   Capital Gain    32561 non-null  float32 
 9   Capital Loss    32561 non-null  float32 
 10  Hours per week  32561 non-null  float32 
 11  Country         32561 non-null  category
dtypes: category(7), float32(5)
memory usage: 862.0 KB


# Replace each categorical column with its integer category codes so the
# feature table becomes fully numeric.
cat_cols = data.select_dtypes(['category']).columns
for col_name in cat_cols:
    data[col_name] = data[col_name].cat.codes
data.head()


# Number of samples per target class.
Counter(labels)

Counter({0: 24720, 1: 7841})


from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split
# reproducible.
# NOTE(review): the split is not stratified although the recorded label counts
# are 24720 vs 7841 — consider stratify=labels if class ratios must match.
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=42)
X_train.shape, X_test.shape

((22792, 12), (9769, 12))


# Reload the dataset in display form (`data` itself was overwritten with
# category codes earlier) and split it with the same test_size/random_state as
# the encoded data.
# NOTE(review): presumably this keeps the *_disp rows aligned with
# X_train/X_test — confirm.
data_disp, labels_disp = shap.datasets.adult(display=True)
X_train_disp, X_test_disp, y_train_disp, y_test_disp = train_test_split(data_disp, labels_disp, test_size=0.3, random_state=42)
X_train_disp.shape, X_test_disp.shape

((22792, 12), (9769, 12))


# First three rows of the encoded training features.
X_train.head(3)


# First three rows of the display-form training features.
X_train_disp.head(3)


import xgboost as xgb

# Gradient-boosted tree classifier for the two-class income target.
# eval_metric is 'logloss' because the objective is binary:logistic;
# 'mlogloss' is the multi-class log loss and does not apply to a binary
# problem (it would only surface as an error once an eval_set is supplied).
xgc = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=5,
    base_score=0.5,
    n_jobs=-2,
    objective='binary:logistic',
    random_state=42,
    eval_metric='logloss',
)
xgc.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=5,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=500, n_jobs=-2,
              num_parallel_tree=None, predictor=None, random_state=42, ...)

XGBClassifier(base_score=0.5, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=5,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=500, n_jobs=-2,
              num_parallel_tree=None, predictor=None, random_state=42, ...)


# Predict class labels for the held-out test features and show the first ten.
predictions = xgc.predict(X_test)
predictions[:10]

array([0, 1, 1, 0, 0, 1, 1, 0, 0, 1])


# Sort the distinct labels so the class order handed to the report is
# deterministic (the iteration order of a set is not guaranteed).
class_labels = sorted(set(labels))
meu.display_model_performance_metrics(true_labels=y_test, predicted_labels=predictions, classes=class_labels)

Model Performance metrics:
------------------------------
Accuracy: 0.8662
Precision: 0.8623
Recall: 0.8662
F1 Score: 0.8635

Model Classification report:
------------------------------
              precision    recall  f1-score   support

           0       0.90      0.93      0.91      7455
           1       0.74      0.66      0.70      2314

    accuracy                           0.87      9769
   macro avg       0.82      0.80      0.81      9769
weighted avg       0.86      0.87      0.86      9769


Prediction Confusion Matrix:
------------------------------
          Predicted:      
                   0     1
Actual: 0       6925   530
        1        777  1537


# Draw XGBoost's three built-in feature-importance views side by side in a
# 2x2 grid (the fourth cell stays empty).
fig = plt.figure(figsize=(16, 12))
title = fig.suptitle("Default Feature Importances from XGBoost", fontsize=14)

importance_panels = [
    ('weight', "Feature Importance - Feature Weight"),
    ('gain', "Feature Importance - Split Mean Gain"),
    ('cover', "Feature Importance - Sample Coverage"),
]
for position, (importance_kind, panel_title) in enumerate(importance_panels, start=1):
    panel_ax = fig.add_subplot(2, 2, position)
    xgb.plot_importance(xgc, importance_type=importance_kind, ax=panel_ax)
    panel_ax.set_title(panel_title)


from sklearn.inspection import PartialDependenceDisplay

# One-way partial dependence for Age and Education-Num plus their two-way
# interaction, computed on the first 10000 training rows.
pdp_features = ['Age', 'Education-Num', ('Age', 'Education-Num')]
pdp_rows = X_train.iloc[:10000]
fig, ax = plt.subplots(1, 3, figsize=(10, 3))
PartialDependenceDisplay.from_estimator(xgc, pdp_rows, pdp_features, ax=ax, n_cols=3)
fig.suptitle('1D & 2D Partial Dependence Plots')
fig.tight_layout()


# Individual conditional expectation curves for the same two features,
# restricted to the first 100 training rows.
ice_features = ['Age', 'Education-Num']
ice_rows = X_train.iloc[:100]
fig, ax = plt.subplots(1, 2, figsize=(6, 3))
PartialDependenceDisplay.from_estimator(xgc, ice_rows, ice_features, ax=ax, n_cols=2, kind='individual')
fig.suptitle('1D ICE Plots')
fig.tight_layout()


# Global view: ELI5 feature weights for the underlying XGBoost booster.
eli5.show_weights(xgc.get_booster())


# Local view: explain the prediction for the first test row (position 0).
# y_test and predictions are indexed by position, matching X_test.iloc.
doc_num = 0
print('Actual Label:', y_test[doc_num])
print('Predicted Label:', predictions[doc_num])
eli5.show_prediction(xgc.get_booster(), X_test.iloc[doc_num], feature_names=list(data.columns), show_feature_values=True)

Actual Label: 0
Predicted Label: 0


# Local view: explain the prediction for the test row at position 2.
doc_num = 2
print('Actual Label:', y_test[doc_num])
print('Predicted Label:', predictions[doc_num])
eli5.show_prediction(xgc.get_booster(), X_test.iloc[doc_num], feature_names=list(data.columns), show_feature_values=True)

Actual Label: 1
Predicted Label: 1


from skater.core.explanations import Interpretation
from skater.model import InMemoryModel


# Create the Interpretation object (built over the test features)
interpreter = Interpretation(training_data=X_test, feature_names=list(data.columns))

# Define the in-memory model that wraps the classifier's predict_proba
im_model = InMemoryModel(xgc.predict_proba, examples=X_train, target_names=['$50K or less', 'More than $50K'])


# Plot Skater's feature importances for the wrapped model.
plots = interpreter.feature_importance.plot_feature_importance(im_model, ascending=False)

[12/12] features ████████████████████ Time elapsed: 1 seconds


# Skater partial dependence of the model output on Age, with variance bands.
r = interpreter.partial_dependence.plot_partial_dependence(['Age'], im_model, grid_resolution=50, 
                                                           grid_range=(0,1), n_samples=10000, 
                                                           with_variance=True, figsize = (6, 4))
# Fix the y-axis to the [0, 1] range; the result is bound to a name only to
# avoid echoing it in the notebook.
yl = r[0][1].set_ylim(0, 1)

[44/44] grid cells ████████████████████ Time elapsed: 30 seconds


# Skater partial dependence of the model output on Education-Num, with
# variance bands.
# NOTE(review): n_samples=23000 exceeds the 9769 test rows the interpreter was
# built on — confirm how Skater handles an oversized sample request.
r = interpreter.partial_dependence.plot_partial_dependence(['Education-Num'], im_model, grid_resolution=50, 
                                                           grid_range=(0,1), n_samples=23000, 
                                                           with_variance=True, figsize = (6, 4))
# Fix the y-axis to the [0, 1] range; the result is bound to a name only to
# avoid echoing it in the notebook.
yl = r[0][1].set_ylim(0, 1)

[16/16] grid cells ████████████████████ Time elapsed: 28 seconds


# Two-way partial dependence for the (Age, Education-Num) feature pair.
plots_list = interpreter.partial_dependence.plot_partial_dependence([('Age', 'Education-Num')], 
                                                                    im_model, figsize=(12, 5), grid_resolution=100)

[1136/1136] grid cells ████████████████████ Time elapsed: 47 seconds


from skater.core.local_interpretation.lime.lime_tabular import LimeTabularExplainer

# Build a LIME tabular explainer from the encoded test features.
# NOTE(review): no categorical_features are passed, so the label-encoded
# categorical columns are presumably treated as continuous and discretized —
# confirm this is intended.
exp = LimeTabularExplainer(X_test.values, feature_names=list(data.columns), 
                           discretize_continuous=True, 
                           class_names=['$50K or less', 'More than $50K'])


# LIME explanation for the first test row (position 0).
doc_num = 0
print('Actual Label:', y_test[doc_num])
print('Predicted Label:', predictions[doc_num])
exp.explain_instance(X_test.iloc[doc_num].values, xgc.predict_proba).show_in_notebook()

Actual Label: 0
Predicted Label: 0


# LIME explanation for the test row at position 2.
doc_num = 2
print('Actual Label:', y_test[doc_num])
print('Predicted Label:', predictions[doc_num])
exp.explain_instance(X_test.iloc[doc_num].values, xgc.predict_proba).show_in_notebook()

Actual Label: 1
Predicted Label: 1


# Build a SHAP tree explainer for the fitted XGBoost model and compute SHAP
# values for the test features.
explainer = shap.TreeExplainer(xgc)
shap_values = explainer.shap_values(X_test)


# Peek at the first rows of the SHAP value matrix.
pd.DataFrame(shap_values).head()


# The explainer's expected value, used below as the base value of the force
# and decision plots.
print('Expected Value:', explainer.expected_value)

Expected Value: -1.439313


# Force plot for the first test row; the display-form features supply the
# human-readable values shown in the plot.
shap.force_plot(explainer.expected_value, shap_values[0,:], X_test_disp.iloc[0,:])


# Force plot for the test row at position 2.
shap.force_plot(explainer.expected_value, shap_values[2,:], X_test_disp.iloc[2,:])


# Combined force plot over the first 1000 test rows.
shap.force_plot(explainer.expected_value, shap_values[:1000,:], X_test_disp.iloc[:1000,:])


# Decision plot for the test row at position 2, drawn with the logit link and
# labelled with the display-form feature values.
shap.decision_plot(explainer.expected_value, shap_values[2],link='logit', 
                   features=X_test_disp.iloc[2,:], 
                   feature_names=(X_test_disp.columns.tolist()), 
                   show=True, title="Decision Plot")


# Global importance: bar-type summary plot of the SHAP values.
shap.summary_plot(shap_values, X_test, plot_type="bar")


# Default summary plot of the SHAP values against the test features.
shap.summary_plot(shap_values, X_test)


# SHAP dependence plots for Age and Education-Num. Each feature is also passed
# as its own interaction index, and the display-form frame supplies the
# values shown on the plot.
for dependence_feature in ('Age', 'Education-Num'):
    shap.dependence_plot(
        ind=dependence_feature,
        interaction_index=dependence_feature,
        shap_values=shap_values,
        features=X_test,
        display_features=X_test_disp,
    )


import explainerdashboard as expdb
from explainerdashboard import ClassifierExplainer, ExplainerDashboard
from explainerdashboard import InlineExplainer
from explainerdashboard.custom import (ImportancesComposite,
                                       IndividualPredictionsComposite,
                                       WhatIfComposite,
                                       ShapDependenceComposite,
                                       ShapInteractionsComposite,
                                       DecisionTreesComposite)

# Create the explainer object
# NOTE(review): this rebinds `explainer`, which earlier held the
# shap.TreeExplainer; re-running the earlier SHAP cells after this one would
# pick up the ClassifierExplainer instead.
explainer = ClassifierExplainer(xgc, X_test, y_test, model_output='logodds')

# Create individual component plots using InlineExplainer

ie = InlineExplainer(explainer)

# SHAP overview
ie.shap.overview()

Detected XGBClassifier model: Changing class type to XGBClassifierExplainer...
Generating self.shap_explainer = shap.TreeExplainer(model)
Calculating shap values...
Dash is running on http://127.0.0.1:8050/


# Inline component: SHAP interaction dependence.
ie.shap.interaction_dependence()

Dash is running on http://127.0.0.1:8050/


# Inline component: SHAP contributions graph.
ie.shap.contributions_graph()

Dash is running on http://127.0.0.1:8050/


# Inline component: SHAP dependence.
ie.shap.dependence()

Dash is running on http://127.0.0.1:8050/


# Model Stats: inline component with the classifier's performance statistics
ie.classifier.model_stats()

Dash is running on http://127.0.0.1:8050/


# Build the full dashboard for the explainer, with the SHAP interaction tab
# switched off.
db = ExplainerDashboard(explainer, 
                        title="Census Data",
                        shap_interaction=False, # you can switch off tabs with bools
                        )
# Serve the dashboard on port 8050.
# NOTE(review): the recorded output of the inline components also reports
# port 8050 — confirm the two do not conflict when run in the same session.
db.run(port=8050)

Building ExplainerDashboard..
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
Generating layout...
Generating xgboost model dump...
Calculating dependencies...
Calculating permutation importances (if slow, try setting n_jobs parameter)...
Calculating pred_percentiles...
Calculating predictions...
Calculating ShadowDecTree for each individual decision tree...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...
Starting ExplainerDashboard on http://192.168.1.110:8050
Dash is running on http://0.0.0.0:8050/

	Age	Workclass	Education-Num	Marital Status	Occupation	Relationship	Race	Sex	Capital Gain	Hours per week	Country
0	39.0	State-gov	13.0	Never-married	Adm-clerical	Not-in-family	White	Male	2174.0	40.0	United-States
1	50.0	Self-emp-not-inc	13.0	Married-civ-spouse	Exec-managerial	Husband	White	Male	0.0	13.0	United-States
2	38.0	Private	9.0	Divorced	Handlers-cleaners	Not-in-family	White	Male	0.0	40.0	United-States

	Age	Workclass	Education-Num	Marital Status	Occupation	Relationship	Race	Sex	Capital Gain	Capital Loss	Hours per week	Country
19749	34.0	Self-emp-not-inc	9.0	Married-civ-spouse	Farming-fishing	Wife	White	Female	0.0	2179.0	12.0	United-States
1216	48.0	Self-emp-not-inc	10.0	Married-civ-spouse	Craft-repair	Husband	Amer-Indian-Eskimo	Male	7688.0	0.0	40.0	United-States
27962	23.0	State-gov	10.0	Married-civ-spouse	Prof-specialty	Husband	White	Male	0.0	0.0	30.0	United-States

Contribution^?	Feature	Value
+0.406	Education-Num	10.000
+0.011	Country	39.000
-0.006	Race	4.000
-0.040	Capital Loss	0.000
-0.044	Workclass	4.000
-0.136	Capital Gain	0.000
-0.139	Sex	0.000
-0.527	Relationship	1.000
-0.659	Occupation	1.000
-1.169	Age	27.000
-1.439	<BIAS>	1.000
-1.579	Marital Status	0.000
-1.971	Hours per week	38.000

Contribution^?	Feature	Value
+1.613	Education-Num	13.000
+0.542	Occupation	4.000
+0.404	Marital Status	2.000
+0.345	Relationship	0.000
+0.102	Hours per week	55.000
+0.099	Race	2.000
+0.069	Sex	1.000
+0.054	Workclass	4.000
-0.038	Country	39.000
-0.132	Capital Loss	0.000
-0.213	Capital Gain	0.000
-1.079	Age	29.000
-1.439	<BIAS>	1.000

	0	1	2	3	4	5	6	7	8	9	10	11
0	-1.656668	-0.018396	0.159149	-1.467291	-0.729105	-0.543311	-0.012643	-0.249136	-0.141240	-0.037275	-1.153366	-0.003618
1	0.460356	-0.225294	-0.461344	0.704099	0.349603	1.291561	0.006843	-0.076172	-0.111211	-0.055266	-0.081491	0.029887
2	-0.949817	-0.055998	1.310552	0.420468	0.554448	0.508344	0.021364	0.058665	-0.178827	-0.100475	0.155954	0.022035
3	-0.365298	0.080139	0.547560	-1.241316	-0.468366	-0.555728	0.039083	-0.358992	-0.153173	-0.054502	-0.242371	0.049659
4	-0.335611	-0.082519	-0.237704	-0.741135	0.159709	-0.235818	-0.062805	0.027067	-5.222733	-0.023092	0.944700	-0.016303

Modellek eredményeinek értelmezése¶

Bemutatkozás¶

Bevezetés: értelmezhető vs. magyarázható¶

Explainable AI¶

Üzleti probléma és az Adatok¶

Klasszikus módszerek¶

Hagyományos technikák¶

Változók fontossága (feature importance)¶

Partial Dependence Plot¶

Individual conditional expectation (ICE)¶

Globális és lokális értelmezést biztosító technikák¶

ELI5¶

Globális értelmezés¶

Lokális értelmezés¶

Skater¶

Globális értelmezés¶

Feature Importance¶

Partial Dependence Plot¶

Lokális értelmezés¶

LIME¶

SHAP¶

Lokális értelmezés¶

Force Plot¶

Decision Plot¶

Globális értelmezés¶

SHAP Feature Importance (summary bar plot)¶

SHAP Summary Plot¶

SHAP Dependence Plot¶

Más segédeszközök¶

Dashboard-ok¶

ExplainerDashboard¶

Egyéb dashboard csomagok:¶

Gyakorló feladat¶

Köszönöm a figyelmet!¶

Weight	Feature
0.3225	Relationship
0.1564	Capital Gain
0.1485	Marital Status
0.0919	Education-Num
0.0731	Capital Loss
0.0407	Sex
0.0362	Occupation
0.0339	Age
0.0294	Hours per week
0.0273	Workclass
0.0223	Race
0.0178	Country